# IV_setup.R

# Part of the replication archive for 
#
#   Bullock, John G. 2020. "Education and Attitudes toward Redistribution in
#   the United States." British Journal of Political Science 50.


# This file is called by many of the files in the "float_code" directory, 
# which create the figures and tables that appear in the article. It loads 
# helper files, recodes a few variables in the ANES and GSS data frames, and 
# performs a few other miscellaneous functions.  



library(Bullock, lib.loc = c(.libPaths(), 'packageLibrary'))  # for meanNA, qw()
library(Formula)
library(dplyr)    # for %>%, arrange(), bind_rows, group_by(), mutate(), rename(), slice()


# CONTRASTS AND PRINTING PRECISION
# By default R applies polynomial contrasts to ordered factors. Use treatment
# contrasts for ordered and unordered factors alike, and print numbers with
# three significant digits.
options(
  contrasts = c(
    unordered = "contr.treatment",
    ordered   = "contr.treatment"
  )
)
options(digits = 3)


# SUFFIX FOR IV OBJECTS THAT ARE ESTIMATED IN BATCHES
# Names of those objects will end with suffixes like '.IV1', '.IV2', and so on.
objectSuffix <- ".IV"


# SOURCE HELPER FILES
# Most of these helpers are not needed by this file itself; they are loaded
# here for the benefit of the scripts that call this file. The files are
# sourced one at a time, in the order listed.
invisible(lapply(
  c(
    "functions/importModels.R",
    "functions/makeIVEstimationDataFrame.R",
    "functions/getNobsForMatrixOfModels.R",
    "functions/estimateModels.R",
    "functions/estTable.R",
    "functions/regTable.R",
    "functions/latexTable.R",
    "functions/latexTablePDF.R"
  ),
  source
))


# LOAD DATA
# ANES_coding.R and GSS_coding.R generate these RDS files.
ANES.df <- readRDS(file = "data/ANES_withMergedCSLs.RDS")
GSS.df  <- readRDS(file = "data/GSS_withMergedCSLs.RDS")


# IMPORT THE IV MODELS INTO AN ENVIRONMENT
# importModels() is presumably the helper defined in functions/importModels.R,
# which is sourced above -- confirm there what `check = TRUE` verifies.
IVModelsEnv <- importModels(
  filename      = "IV_models.R",
  endogVarNames = "educ",
  check         = TRUE
)


# CALCULATE MEDIAN GSS YEAR-WHEN-YOUNG
# Taken over all rows of GSS.df as loaded, i.e., before any cases are dropped
# further down the file. Missing values are ignored.
yearNormToSubtract <- median(GSS.df$yearYoung, na.rm = TRUE)



##############################################################################
# SET UP BASIC ANES DATA FRAME FOR ESTIMATION
##############################################################################
# Center year-when-young on the GSS median (1962, per the original note).
# This column is assigned again in the "ADD VARIABLES TO ANES AND GSS DATA
# FRAMES" section further down the file.
ANES.df$yearYoungNorm <- ANES.df$yearYoung - yearNormToSubtract


# HANDLE RESPONDENTS WITH MULTIPLE CASES IN ANES.DF
# First, for cases with duplicate respondents, use the mean outcome variable.
# Then take the last available row (last interview) for each respondent.
#
# Note that the averaging means that the ANES.df variables will take on more
# than seven values. For example, respondents who give an answer of 5 in one
# wave and 6 in another will be coded as having a value of 5.5.
#
# arrange() ignores the grouping and sorts the whole data frame by yearInt,
# which is enough to make the last row within each respondent the row with
# the latest interview year.
# NOTE(review): arrange() places NA last, so if yearInt can be missing, the
# row with the missing year is the one that slice(n()) keeps -- confirm that
# yearInt has no NAs.
#
# The pipeline ends with ungroup() so that the one-group-per-respondent
# structure does not linger on ANES.df through the row subsetting and column
# assignments that follow; none of those steps needs the grouping.
ANES.df <- ANES.df %>%
  group_by(ID.unique) %>%
  arrange(yearInt) %>%
  mutate(
    guarantee.7pt   = meanNA(guarantee.7pt),
    govt.health.7pt = meanNA(govt.health.7pt)) %>%
  slice(n()) %>%  # Take first row with slice(1), last row with slice(n()).
  ungroup()


# FURTHER PROCESSING
# Omit all cases for which the instruments are missing data. (The check is
# made on CA only.)
ANES.df <- ANES.df[!is.na(ANES.df$CA), ]

# Drop levels from yearInt.fac.  It originally had no empty levels, but the
# code immediately above, which omits all cases for which the instruments are
# missing data, may cause some categories to be empty.
ANES.df$yearInt.fac <- droplevels(ANES.df$yearInt.fac)



##############################################################################
# SET UP BASIC GSS DATA FRAME FOR ESTIMATION
##############################################################################
# Keep a copy of the education variable as loaded; GSS.df$educ is redefined
# later in the file.
GSS.df$educYearsUncensored <- GSS.df$educ

# Center year-when-young on the GSS median (1962, per the original note).
GSS.df$yearYoungNorm <- GSS.df$yearYoung - yearNormToSubtract

# Retain only the cases for which the instruments are observed; the check is
# made on CA.
GSS.df <- GSS.df[which(!is.na(GSS.df$CA)), ]

# yearInt.fac had no empty levels originally, but removing the cases above may
# have left some of its categories empty. Discard any such unused levels.
GSS.df$yearInt.fac <- droplevels(GSS.df$yearInt.fac)




##############################################################################
# ADD VARIABLES TO ANES AND GSS DATA FRAMES
##############################################################################
# Both surveys are centered on the GSS medians, for consistency across data
# sets; the original author notes that this choice doesn't affect the
# estimates. The medians are taken from GSS.df as it stands at this point,
# i.e., after the rows with missing CA have been dropped.

# Interview year (original note: GSS median is 1991).
GSS.df$yearIntNorm  <- GSS.df$yearInt  - median(GSS.df$yearInt, na.rm = TRUE)
ANES.df$yearIntNorm <- ANES.df$yearInt - median(GSS.df$yearInt, na.rm = TRUE)

# Year when young (original note: GSS median is 1962). These assignments
# replace the yearYoungNorm values that were set earlier in the file.
GSS.df$yearYoungNorm  <- GSS.df$yearYoung  - median(GSS.df$yearYoung, na.rm = TRUE)
ANES.df$yearYoungNorm <- ANES.df$yearYoung - median(GSS.df$yearYoung, na.rm = TRUE)

# VARIABLES NEEDED FOR COMPARISON OF DIFFERENT INSTRUMENT SETS
# Logical indicators: TRUE when CA (or CL) is at least the stated threshold,
# NA when the underlying value is missing.
GSS.df$CA.facGeq10  <- GSS.df$CA >= 10
GSS.df$CL.facGeq8   <- GSS.df$CL >= 8
GSS.df$CL.facGeq9   <- GSS.df$CL >= 9
ANES.df$CA.facGeq10 <- ANES.df$CA >= 10
ANES.df$CL.facGeq8  <- ANES.df$CL >= 8
ANES.df$CL.facGeq9  <- ANES.df$CL >= 9



##############################################################################
# REDEFINING EDUCATION AND THE SCHOOLING-LAW INSTRUMENTS
##############################################################################
# See the "Data" section of the article for the operationalization of the
# education variable and of the compulsory-attendance instruments.
#
# All of the cut() calls below use the default right-closed intervals, so, for
# example, breaks of (-100, 7, 10, 100) yield the categories (-100,7], (7,10],
# and (10,100].

# --- ANES -------------------------------------------------------------------
ANES.df$educ   <- ANES.df$yearsTo13
ANES.df$CA.fac <- cut(ANES.df$CA, breaks = c(-100, 7, 10, 100))

# Acemoglu-Angrist (2001) instruments, used in the appendix section that
# reports results under other scholars' instrument sets.
ANES.df$CA.fac4 <- cut(ANES.df$CA, breaks = c(-100, 8, 9, 10, 100))
ANES.df$CL.fac4 <- cut(ANES.df$CL, breaks = c(-100, 6, 7,  8, 100))

# Oreopoulos "leaving age" variable, used in the same appendix section. TRUE
# when the smaller of work_age and drop_age is at least 16; NA if either of
# the two is missing.
ANES.df$leaving_age16 <- pmin(ANES.df$work_age, ANES.df$drop_age) >= 16

# --- GSS (same recodes as for the ANES) -------------------------------------
GSS.df$educ          <- GSS.df$yearsTo13
GSS.df$CA.fac        <- cut(GSS.df$CA, breaks = c(-100, 7, 10, 100))
GSS.df$CA.fac4       <- cut(GSS.df$CA, breaks = c(-100, 8, 9, 10, 100))
GSS.df$CL.fac4       <- cut(GSS.df$CL, breaks = c(-100, 6, 7,  8, 100))
GSS.df$leaving_age16 <- pmin(GSS.df$work_age, GSS.df$drop_age) >= 16




##############################################################################
# RESCALE OUTCOME VARIABLES
##############################################################################
# Apply Bullock::rescale to each outcome column, replacing the column in
# place. (The exact mapping is defined in the Bullock package, not here.)
GSS.df <- mutate_at(
  GSS.df,
  .vars = qw("eqwlth goveqinc helppoor welfare"),
  .funs = Bullock::rescale
)

# ungroup() ensures that the rescaling is computed over each whole column
# rather than within any grouping that is still attached to ANES.df.
ANES.df <- mutate_at(
  ungroup(ANES.df),
  .vars = qw("guarantee.7pt govt.health.7pt"),
  .funs = Bullock::rescale
)